Note: There are often multiple ways to answer each question.

Download `nba_free_throws.csv` from https://github.com/kjytay/misc/tree/master/data. (Right-click on `nba_free_throws.csv` and select “Save Link As…”.) Import this dataset into R as the variable `df`. Are there columns which need their format changed? (You can read more about the dataset here.)

# You can import it using the "Import Dataset" button as well.
# Read the free-throw data into `df`. The relative path means the csv file
# must be in the current working directory.
# game_id is an identifier, so it is read as character rather than left to
# readr's column-type guessing.
library(readr)
df <- read_csv(
  file = "nba_free_throws.csv",
  col_types = cols(game_id = col_character())
)
Drop the column `play` and save the result in `df2`.

library(dplyr)
# Keep only 2015 - 2016 regular-season rows, then drop the `play` column.
# Comma-separated conditions in filter() are combined with a logical AND.
df2 <- df %>%
  filter(season == "2015 - 2016", playoffs == "regular") %>%
  select(-play)
All questions from here are about `df2`.
# Ten players with the most free-throw attempts: one row of df2 is one
# attempt, so counting rows per player gives shots taken.
df2 %>%
  count(player, name = "shots_taken", sort = TRUE) %>%
  head(n = 10)
## # A tibble: 10 x 2
## player shots_taken
## <chr> <int>
## 1 James Harden 837
## 2 DeMarcus Cousins 663
## 3 DeMar DeRozan 653
## 4 DeAndre Jordan 619
## 5 Andre Drummond 586
## 6 Russell Westbrook 573
## 7 Andrew Wiggins 565
## 8 Isaiah Thomas 541
## 9 Paul George 528
## 10 Kevin Durant 498
# Ten players with the highest free-throw percentage, with no minimum number
# of attempts (so players with a handful of shots can top the list).
df2 %>%
  group_by(player) %>%
  summarize(
    shots_taken = n(),
    shots_made = sum(shot_made),
    # summarize() can refer to the summaries defined just above
    free_throw_pct = shots_made / shots_taken * 100
  ) %>%
  arrange(desc(free_throw_pct)) %>%
  head(n = 10)
## # A tibble: 10 x 4
## player shots_taken shots_made free_throw_pct
## <chr> <int> <dbl> <dbl>
## 1 Branden Dawson 1 1 100
## 2 Chris Kaman 3 3 100
## 3 Chuck Hayes 2 2 100
## 4 Damjan Rudez 8 8 100
## 5 Erick Green 2 2 100
## 6 Jarell Eddie 8 8 100
## 7 Jeff Ayres 6 6 100
## 8 Jodie Meeks 4 4 100
## 9 Jordan Farmar 10 10 100
## 10 Keith Appling 2 2 100
# Same ranking as before, restricted to players with at least 100 attempts.
df2 %>%
  group_by(player) %>%
  summarize(
    shots_taken = n(),
    shots_made = sum(shot_made),
    free_throw_pct = shots_made / shots_taken * 100
  ) %>%
  filter(shots_taken >= 100) %>%
  arrange(desc(free_throw_pct)) %>%
  head(n = 10)
## # A tibble: 10 x 4
## player shots_taken shots_made free_throw_pct
## <chr> <int> <dbl> <dbl>
## 1 Stephen Curry 400 363 90.8
## 2 Jamal Crawford 271 245 90.4
## 3 Kevin Durant 498 447 89.8
## 4 Chris Paul 328 294 89.6
## 5 Dirk Nowitzki 280 250 89.3
## 6 Jarrett Jack 112 100 89.3
## 7 Damian Lillard 464 414 89.2
## 8 Kevin Martin 172 153 89.0
## 9 Kyrie Irving 188 167 88.8
## 10 Eric Gordon 125 111 88.8
Save the per-player summary as `summary_df` (only players who took at least 100 free throws). Using `summary_df`, make a scatterplot of free throw percentage vs. free throws taken. Set the `alpha` value of the points to `0.5`, and draw a blue dashed horizontal line to show the mean free throw percentage across these players. (Hint: For the horizontal line, use `geom_abline`.)

library(ggplot2)
# Per-player summary (attempts, makes, percentage) for players with at least
# 100 free-throw attempts.
summary_df <- df2 %>%
  group_by(player) %>%
  summarize(
    shots_taken = n(),
    shots_made = sum(shot_made),
    free_throw_pct = shots_made / shots_taken * 100
  ) %>%
  filter(shots_taken >= 100)
# Free-throw percentage against attempts, one semi-transparent point per
# player. The zero-slope abline is a horizontal line at the mean percentage
# across the players in summary_df.
ggplot(data = summary_df, mapping = aes(x = shots_taken, y = free_throw_pct)) +
  geom_point(alpha = 0.5) +
  geom_abline(
    intercept = mean(summary_df$free_throw_pct),
    slope = 0,
    colour = "blue",
    linetype = "dashed"
  )
Which game (in `df2`) had the most free throws? Save the rows in `df2` from that game in `df3`.

# This can be done more easily by visually inspecting the game_id with the most
# free throws, then hardcoding it into the filter. The solution below does this
# programmatically.
# Find the game with the most free throws: tally rows per game_id, sorted so
# the largest count comes first, and keep the first game_id. If several games
# tie for the maximum, the first one in the sorted tally is used.
# Only game_id is needed to pick the game, so no other column is summarised.
id <- df2 %>%
  count(game_id, sort = TRUE) %>%
  pull(game_id) %>%
  head(n = 1)

# All free throws from that game.
df3 <- df2 %>% filter(game_id == id)
Add `coord_flip()` as a layer to the plot so that the bars are horizontal. Sort the bars such that the longest ones go on top. (Hint: The `forcats` package will be helpful here, as will the last example of Section 15.4 of R4DS.)

# without sorting of bars
# Horizontal bar chart of free throws per player in df3; with no ordering
# applied, the bars follow ggplot2's default ordering of `player`.
ggplot(data = df3, mapping = aes(x = player)) +
  geom_bar() +
  coord_flip()

# with sorting of bars
# fct_infreq() orders the player levels by decreasing frequency; fct_rev()
# reverses that so, after coord_flip(), the longest bar ends up on top.
library(forcats)
df3 %>%
  mutate(player = fct_rev(fct_infreq(player))) %>%
  ggplot(mapping = aes(x = player)) +
  geom_bar() +
  coord_flip()
The following code joins data from `summary_df` to `df3` and saves it as `df4` (treat it as a magical incantation for now: the `left_join()` function is from the `dplyr` package):
# Attach each player's summary_df columns to the rows of df3, matching on
# `player`; every row of df3 is kept.
df4 <- left_join(df3, summary_df, by = "player")
Use `scale_fill_distiller(palette = "RdYlGn", direction = 1)` to give your bars some appropriate colors. Why are some bars grey?

# summary_df only contains data for players who attempted at least 100 shots.
# For players with fewer than 100 shots, the new columns added by the left_join
# are all set to NA.
# Bars per player in df4, filled by the player's free_throw_pct on a
# red-yellow-green scale. Players without a match in summary_df have an NA
# free_throw_pct, so their bars get the scale's NA colour.
ggplot(data = df4, mapping = aes(x = player, fill = free_throw_pct)) +
  geom_bar() +
  scale_fill_distiller(palette = "RdYlGn", direction = 1) +
  coord_flip()

# with sorting of bars
# Order the player levels by frequency and reverse them so the longest bar
# is drawn at the top after the flip.
df4 %>%
  mutate(player = fct_rev(fct_infreq(player))) %>%
  ggplot(mapping = aes(x = player, fill = free_throw_pct)) +
  geom_bar() +
  scale_fill_distiller(palette = "RdYlGn", direction = 1) +
  coord_flip()
Using `tidyr`’s `separate` function, separate the `game` column in `df2` into a `home` column which has the name of the home team, and an `away` column which has the name of the away team.

library(tidyr)
# Split the matchup string in `game` (presumably of the form "AAA - BBB")
# into two team columns. The delimiter is given explicitly (a hyphen with
# optional surrounding whitespace) instead of relying on separate()'s
# default, which splits on any run of non-alphanumeric characters.
# NOTE(review): the first team is labelled "home" and the second "away";
# confirm against the dataset documentation which side of the hyphen is
# actually the home team.
df2 %>%
  separate(game, into = c("home", "away"), sep = "\\s*-\\s*")
## # A tibble: 57,304 x 11
## end_result home away game_id period player playoffs score season
## <chr> <chr> <chr> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 106 - 94 DET ATL 400827… 1 Marcu… regular 5 - 4 2015 …
## 2 106 - 94 DET ATL 400827… 1 Marcu… regular 5 - 4 2015 …
## 3 106 - 94 DET ATL 400827… 1 Andre… regular 10 -… 2015 …
## 4 106 - 94 DET ATL 400827… 1 Andre… regular 10 -… 2015 …
## 5 106 - 94 DET ATL 400827… 1 Paul … regular 15 -… 2015 …
## 6 106 - 94 DET ATL 400827… 1 Paul … regular 15 -… 2015 …
## 7 106 - 94 DET ATL 400827… 1 Reggi… regular 23 -… 2015 …
## 8 106 - 94 DET ATL 400827… 2 Al Ho… regular 31 -… 2015 …
## 9 106 - 94 DET ATL 400827… 2 Al Ho… regular 31 -… 2015 …
## 10 106 - 94 DET ATL 400827… 3 Kenta… regular 56 -… 2015 …
## # … with 57,294 more rows, and 2 more variables: shot_made <dbl>,
## # time <time>